Web Scraping with Python

Author

Tony Duan

Introduction to Web Scraping with Python

Web scraping is the process of automatically extracting data from websites. Python is one of the most popular languages for web scraping due to its powerful libraries and simple syntax.

What You’ll Learn

  • Basic concepts of web scraping
  • Using popular Python libraries: requests, BeautifulSoup, Selenium
  • Best practices and ethical considerations
  • Real-world examples

Prerequisites

Make sure you have these libraries installed:

Code
pip install requests beautifulsoup4 selenium pandas lxml

Essential Libraries

| Library       | Purpose            | Best For                    |
|---------------|--------------------|-----------------------------|
| requests      | HTTP requests      | Simple static websites      |
| BeautifulSoup | HTML parsing       | Parsing and extracting data |
| Selenium      | Browser automation | Dynamic/JavaScript websites |
| pandas        | Data manipulation  | Organizing scraped data     |

Getting Started with Requests

The requests library is the foundation for most web scraping tasks.

Code
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Basic GET request
url = "http://books.toscrape.com/"
response = requests.get(url)

# Report the outcome; keep the raw HTML only when the server answered 200.
status = response.status_code
if status != 200:
    print(f"Error: {status}")
else:
    print("Successfully fetched the page!")
    html_content = response.text
Successfully fetched the page!

Headers and User-Agent

Always set proper headers to mimic a real browser:

Code
# Identify ourselves as a regular desktop browser so naive bot
# filters don't reject the request outright.
user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
)
headers = {'User-Agent': user_agent}

response = requests.get(url, headers=headers)

Parsing HTML with BeautifulSoup

BeautifulSoup makes it easy to navigate and extract data from HTML.

Code
from bs4 import BeautifulSoup

# Build a navigable parse tree from the HTML fetched earlier
soup = BeautifulSoup(html_content, 'html.parser')

# Grab the elements of interest up front
book_titles = soup.find_all('h3')                       # every book heading
book_prices = soup.find_all('p', class_='price_color')  # every price tag
books = soup.find_all('article', class_='product_pod')  # one container per book

# Walk the first three containers and pull out title + price
for entry in books[:3]:
    title = entry.h3.a['title']
    price = entry.find('p', class_='price_color').text
    print(f"Title: {title}, Price: {price}")
Title: A Light in the Attic, Price: £51.77
Title: Tipping the Velvet, Price: £53.74
Title: Soumission, Price: £50.10

Advanced Selectors

Code
# CSS selectors: class selector for the book containers
books = soup.select('.product_pod')

# Tag + class selector for the prices
prices = soup.select('p.price_color')

# Navigate upward from each price to its enclosing book container
# (first five prices as an example)
for price_tag in prices[:5]:
    container = price_tag.find_parent('article', class_='product_pod')
    print(f"Found parent container: {container.h3.a['title']}")
Found parent container: A Light in the Attic
Found parent container: Tipping the Velvet
Found parent container: Soumission
Found parent container: Sharp Objects
Found parent container: Sapiens: A Brief History of Humankind

Real-World Example: Scraping Book Information

Code
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_books(url):
    """Scrape book information from a bookstore listing page.

    Parameters
    ----------
    url : str
        URL of a catalogue page, e.g. http://books.toscrape.com/.

    Returns
    -------
    pandas.DataFrame
        One row per book with columns: title, price, stock, rating.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors, timeouts, or HTTP error statuses.
        (Previously a non-200 response was silently parsed as if it
        were the catalogue, yielding an empty DataFrame.)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # timeout prevents an unresponsive server from hanging the scraper;
    # raise_for_status fails loudly instead of scraping an error page.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    books_data = []

    # Each book on the page lives in an <article class="product_pod">
    for book in soup.find_all('article', class_='product_pod'):
        title = book.h3.a['title']
        price = book.find('p', class_='price_color').text
        stock = book.find('p', class_='instock availability').text.strip()
        # The rating is encoded as the second CSS class of the first <p>,
        # e.g. <p class="star-rating Three">
        rating = book.p['class'][1]

        books_data.append({
            'title': title,
            'price': price,
            'stock': stock,
            'rating': rating
        })

    return pd.DataFrame(books_data)

# Example usage: scrape the first catalogue page into a DataFrame
books_df = scrape_books('http://books.toscrape.com/')
print(books_df.head())
                                   title   price     stock rating
0                   A Light in the Attic  £51.77  In stock  Three
1                     Tipping the Velvet  £53.74  In stock    One
2                             Soumission  £50.10  In stock    One
3                          Sharp Objects  £47.82  In stock   Four
4  Sapiens: A Brief History of Humankind  £54.23  In stock   Five

Best Practices

1. Respect Robots.txt

Always check a website’s robots.txt file:

Code
import requests

def check_robots_txt(domain):
    """Fetch and return the robots.txt rules for *domain*.

    Parameters
    ----------
    domain : str
        Bare domain name, e.g. "books.toscrape.com".

    Returns
    -------
    str
        The raw robots.txt contents.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors, timeouts, or HTTP error statuses.
        (Previously a 404 returned the error page's body as if it
        were the robots.txt.)
    """
    robots_url = f"https://{domain}/robots.txt"
    # timeout keeps a dead host from hanging the whole script
    response = requests.get(robots_url, timeout=10)
    response.raise_for_status()
    return response.text

# Example: Check robots.txt for books.toscrape.com
# print(check_robots_txt("books.toscrape.com"))

2. Rate Limiting

Be respectful and don’t overwhelm servers:

Code
import time
import random

def polite_requests(urls, delay_range=(1, 3)):
    """Lazily fetch *urls*, sleeping a random interval between requests.

    The previous version slept after fetching but *before* yielding, so
    every response — including the first — reached the caller only after
    the delay. Sleeping only between consecutive requests is equally
    polite to the server but delivers each response immediately.

    Parameters
    ----------
    urls : iterable of str
        URLs to fetch in order.
    delay_range : tuple of (float, float), optional
        (min, max) seconds to sleep between consecutive requests.

    Yields
    ------
    requests.Response
        The response for each URL, in order.
    """
    for i, url in enumerate(urls):
        if i:  # pause only *between* requests, never before the first
            time.sleep(random.uniform(*delay_range))
        yield requests.get(url, timeout=10)

# Example usage:
# urls = [
#     "http://books.toscrape.com/catalogue/page-1.html",
#     "http://books.toscrape.com/catalogue/page-2.html",
#     "http://books.toscrape.com/catalogue/page-3.html"
# ]
# for response in polite_requests(urls):
#     print(f"Fetched: {response.url}")

3. Error Handling

Always implement proper error handling:

Code
def safe_request(url, max_retries=3):
    """Fetch *url*, retrying with exponential backoff on failure.

    Parameters
    ----------
    url : str
        URL to fetch.
    max_retries : int, optional
        Total number of attempts before giving up.

    Returns
    -------
    requests.Response
        The successful response.

    Raises
    ------
    requests.exceptions.RequestException
        Re-raised from the final failed attempt.
    """
    attempt = 0
    while True:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # treat HTTP error statuses as failures
            return response
        except requests.exceptions.RequestException as e:
            attempt += 1
            print(f"Attempt {attempt} failed: {e}")
            if attempt >= max_retries:
                raise
            # 1s, 2s, 4s, ... between successive retries
            time.sleep(2 ** (attempt - 1))

# Example usage:
# response = safe_request("http://books.toscrape.com/")
# print("Successfully fetched page!")

Common Challenges and Solutions

1. Dealing with Pagination

Code
def extract_books_from_page(html_content):
    """Parse one catalogue page and return a list of book dicts.

    Parameters
    ----------
    html_content : str or bytes
        Raw HTML of a catalogue page.

    Returns
    -------
    list of dict
        One dict per book with keys: title, price, stock, rating.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    records = []
    # Every book sits in its own <article class="product_pod">
    for article in soup.find_all("article", class_="product_pod"):
        records.append(
            {
                "title": article.h3.a["title"],
                "price": article.find("p", class_="price_color").text,
                "stock": article.find("p", class_="instock availability").text.strip(),
                # second CSS class of the first <p> encodes the star rating
                "rating": article.p["class"][1],
            }
        )

    return records


def scrape_multiple_pages(base_url, max_pages=5):
    """Scrape up to *max_pages* catalogue pages, stopping at the first failure.

    Parameters
    ----------
    base_url : str
        Site root, e.g. "http://books.toscrape.com/".
    max_pages : int, optional
        Maximum number of catalogue pages to fetch.

    Returns
    -------
    list of dict
        Book records accumulated across all successfully fetched pages.
    """
    collected = []

    page = 1
    while page <= max_pages:
        response = requests.get(f"{base_url}catalogue/page-{page}.html")

        if response.status_code != 200:
            # Past the last page (or a transient error) — stop here.
            print(f"Failed to fetch page {page}")
            break

        rows = extract_books_from_page(response.content)
        collected.extend(rows)
        print(f"Scraped {len(rows)} books from page {page}")
        page += 1

    return collected


# Example usage: scrape the first three catalogue pages
books_data = scrape_multiple_pages("http://books.toscrape.com/", 3)
print(f"Scraped {len(books_data)} books from 3 pages")
Scraped 20 books from page 1
Scraped 20 books from page 2
Scraped 20 books from page 3
Scraped 60 books from 3 pages

2. Handling Forms and Authentication

Code
def login_and_scrape(login_url, target_url, credentials):
    """Log in via *login_url*, then fetch *target_url* in the same session.

    Parameters
    ----------
    login_url : str
        URL the login form POSTs to.
    target_url : str
        URL of the protected page to scrape after logging in.
    credentials : dict
        Must contain 'username' and 'password' keys.

    Returns
    -------
    bytes or None
        Raw body of the protected page, or None if the login POST failed.
        (Previously a failed POST fell off the end of the function and
        returned None silently, with no indication of what went wrong.)
    """
    # A Session carries cookies across requests, so the login
    # survives into the follow-up GET.
    session = requests.Session()

    login_data = {
        'username': credentials['username'],
        'password': credentials['password']
    }

    response = session.post(login_url, data=login_data)

    if response.status_code != 200:
        # Make the failure explicit instead of silently returning None.
        print(f"Login request failed with status {response.status_code}")
        return None

    # NOTE(review): a 200 on the POST does not guarantee the login
    # succeeded — many sites return 200 with an error page. Verify
    # against the specific site (e.g. check for a logout link).
    protected_response = session.get(target_url)
    return protected_response.content
Back to top